#!/bin/bash
#
# Description: Gather basic HMC problem debugging information
#
# Change Activity:
#   05/22/2003 L. Brocious    - Initial version
#   08/18/2004 L. Brocious    - Serious enhancements and some corrections
#                               . Added command line options
#                               . Use DFC lookup for file locations when possible
#                               . Wait for JVM dumps to complete
#   08/20/2004 L. Brocious    - Add output from ps command to capture full command
#                               line of each process
#
#STARTUSAGE
#
# This script gathers some basic information about the processes running
# on the system, including new JVM thread/heap dumps of the HMC/CCFW JVM,
# as well as other basic HMC debugging information, and creates a
# compressed tar file containing that information.  This script will
# determine whether a zSeries or pSeries HMC is running. This script 
# assumes that the JVM is the IBM JVM.  You must be logged in as root
# to run this script.
#
# It gathers debugging information such as:
# . Output from commands such as top, ps, pstree and free
# . HMC log file and compressed log files
# . HMC trace file
# . JVM thread dump(s)
# . JVM heap dump(s)
# . HMC linemode console output
#
# Usage:
#    hmcdebuginfo [-j][-q][-?]
# -j option ("alljvmdumps") causes all JVM thread dumps and heap dumps
#          to be included, not just the new dumps initiated by this script.
# -q option ("quick") eliminates the wait for the JVM thread/heap dumps
#          if the JVM does not acknowledge the dump request in a timely 
#          fashion.  This script continues without gathering JVM dumps.
#          This option is convenient when the JVM is not responsive at
#          all and you still wish to gather other debugging information.
# -? displays this usage information.
#
#ENDUSAGE
#****************************************************************************

# Print the PID of the top-level JVM process on stdout.  Prints nothing if
# the top-level JVM process cannot be found.
# Arguments
# . $1 - command name of the HMC script that starts the JVM
# Outputs
# . PID of the first "java" entry listed after the first entry whose command
#   name equals $1 in "ps -eH" output (if $1 is itself "java", the first
#   "java" entry is reported)
getjvmpid() {
   # Keep every working variable local so a direct (non-subshell) call does
   # not clobber same-named variables belonging to the caller.
   local searchtarget="$1"   # The first thing to find is the script that starts the JVM
   local pid tty time cmd

   # Loop through "ps -eH" output to find the PID of the top-level JVM process.
   # The loop runs in a pipeline subshell, which is fine because the search
   # state (searchtarget) is only needed inside the loop itself.
   ps -eH | while read -r pid tty time cmd; do
      # Ignore every line that is not what we are currently looking for
      [[ "$cmd" == "$searchtarget" ]] || continue

      if [[ "$searchtarget" == "java" ]]; then   # Found the top-level JVM process
         printf '%s\n' "$pid"   # Return this process' PID to our caller
         break
      fi

      # We found the script that starts the JVM; the next thing to look
      # for is the top-level java process
      searchtarget="java"
   done
}


# Option flags; each one stays 0 unless its switch appears on the command line
alljvmdumps=0   # -j: gather every JVM dump, not just the ones requested by this run
quick=0         # -q: do not keep waiting on a JVM that ignores the dump request
giveUsage=0     # -?: print the usage text and exit

# Walk the command line switches
while getopts 'jq?' optname; do
   case "$optname" in
      q)
         quick=1
         ;;
      j)
         alljvmdumps=1
         ;;
      \?)
         # Reached for an explicit -?; getopts also reports an unrecognized
         # switch as '?', so that case ends up here as well.
         giveUsage=1
         break
         ;;
   esac
done

if (( giveUsage == 1 )); then
   # The usage text is this file's own prologue: strip the leading '#' from
   # the lines between the two usage markers and discard everything else.
   sed -e '/STARTUSAGE/,/ENDUSAGE/ s/^#//' \
       -e '1,/STARTUSAGE/ d' \
       -e '/ENDUSAGE/,$ d' \
       "$0"
   exit 0
fi

# OPTIND is the index of the first positional parameter.  Discard the
# switches already handled so that any remaining parameters start at $1.
if (( OPTIND != 1 )); then
   shift $(( OPTIND - 1 ))
fi

# Make sure we're root.  The comparison operand is quoted on purpose: if
# whoami prints nothing, an unquoted operand turns the test into a syntax
# error, the condition evaluates as false and the check would be skipped.
me=$(whoami)                # Current user's login name
if [[ "$me" != "root" ]]; then
   echo "You must be logged in as root to run this script; you are currently logged in as $me."
   exit 1
fi

# Startup scripts that keep running once the HMC is up.  Which one shows up
# in the process tree tells us whether this is a zHMC or a pHMC.
zhmctopscript='initconsole'   # Top-level zSeries HMC startup script
phmctopscript='runccfw'       # Top-level pSeries HMC startup script
topscripts="$zhmctopscript $phmctopscript"

# Polling parameters used while waiting for the JVM dumps
sleepamount='20'  # Seconds to wait between checks for JVM dumps
sleepcount=15     # Maximum number of waits before giving up on the dumps

# Identify this run by system name and date/time stamp
now=$(date +%Y%m%d.%H%M%S)
hostname=$(hostname)

# Output files created by this script.  They share a prefix carrying the
# hostname and timestamp so that each run gets unique names.
outputprefix="/tmp/hmcdebuginfo.$hostname.$now"
logfn="$outputprefix.log"
tarfn="$outputprefix.tgz"
tracebufsfn="$outputprefix.showTraceBuf"

# Look for each flavor's top-level script in the process tree; stop at the
# first one found, leaving its name in topscript and the matching pstree
# output in line.
for topscript in $topscripts; do
   line=$(pstree -lp | grep "$topscript")
   [ -z "$line" ] || break
done

# Neither script is running: nothing to debug
[ -n "$line" ] || {
   echo "Unable to find the top-level HMC script ($topscripts) in pstree output."
   echo "The HMC does not appear to be running at this time.  Exiting..."
   exit 2
}

# Set a bunch of variables to identify the important directories and files we
# want to collect.  Use an iqzddfc.trm lookup whenever possible.
# NOTE(review): queryFileLocation comes from the sourced hmcfunctions file and
# is not visible here; the code below appends a file name directly to its
# output, so it presumably prints a directory with a trailing slash - confirm.
if [ "$topscript" == "$zhmctopscript" ]; then  # zSeries HMC is running
   hmctopdir='/console'              # Top-level HMC code/data directory
   export CONSOLE_PATH=$hmctopdir/   # Needed by hmcfunctions
   . "$hmctopdir/hmcfunctions"       # Get access to common function definitions
   jvmscript='startdriver'           # The script that starts the JVM
   consolelog='/var/log/hmc.log'     # Line mode output from startup scripts and JVM
   platformspecificfiles=''          # Anything specific to this variety of HMC
else                                           # pSeries HMC is running
   hmctopdir='/opt/ccfw'
   export CONSOLE_PATH=$hmctopdir/
   . "$hmctopdir/hmcfunctions"
   jvmscript='runccfw'
   consolelog="$hmctopdir/ccfw.out"
   platformspecificfiles="$(queryFileLocation iqzdtrac.trm)iqzdtrac.trm.previous" # From startccfw
fi

# More directories/files.  These are the same on both zSeries and pSeries.
# Wildcard patterns handed to queryFileLocation are quoted so that files in
# the current directory cannot be substituted for the pattern by the shell.
# The wildcards stored in the variables themselves are left unexpanded here;
# they are expanded later, where the variables are used unquoted.
tracefile="$(queryFileLocation iqzdtrac.trm)iqzdtrac.trm"   # Trace file
logfile="$(queryFileLocation iqyylog.log)iqyylog.log" # Log file
compressedlogfiles="$(queryFileLocation 'iqyycom*.log')iqyycom*.log" # Compressed log files
jvmdumpdir="$(queryFileLocation jvmdumptargetdirectory)" # JVM thread/heap dumps dir
jvmthreaddumps="${jvmdumpdir}javacore*"                  # JVM thread dumps
jvmheapdumps="${jvmdumpdir}heapdump*"                    # JVM heap dumps
corefiles="$(queryFileLocation 'core.*')core*"           # Core dumps


# Write a banner naming the given command line, then run that command.
# Both go to stdout; the caller decides where stdout ends up.
logcmdoutput() {
   echo "*** Output from '$*' command follows ***"
   "$@"
}

# Start a new log file (replacing any existing one of the same name) with
# the output of a series of system status commands.
{
   echo "*** Gathering info from host $hostname at date.time $now"
   logcmdoutput top -bn1
   logcmdoutput ps -Afww
   logcmdoutput pstree -lp
   logcmdoutput free
   logcmdoutput netstat -atpn
   logcmdoutput ps -Aww -o pid,start_time,etime,%cpu,args
   logcmdoutput df
} > "$logfn"

# Dump all the in-process trace buffers into their own file; only a marker
# line goes into the log.
echo "*** Getting all in-process trace buffers ***" >> "$logfn"
"$hmctopdir/bin/base/showTraceBuf" all > "$tracebufsfn" 2>&1

# Request the JVM to take a thread dump and heap dump, if configured to do so.
# On completion javacore/heapdump hold the dump file names reported by the
# JVM; they stay empty when the dumps were not (or not yet) produced.
# Exit codes: 3 = JVM process not found, 4 = signal could not be sent.
javacore=''   # Start empty so a same-named environment variable cannot leak in
heapdump=''
pid=$(getjvmpid "$jvmscript")
if [ -z "$pid" ]; then
   echo "Unable to find top-level JVM process.  Exiting..." | tee -a "$logfn"
   exit 3
fi

# Number of lines in console log before we signal JVM
filesize=$(wc -l < "$consolelog")
filesize=${filesize:-0}   # Treat an unreadable console log as empty

# Print only the console log lines added after the JVM was signalled.
# "tail -n +K" starts output at line K, which also behaves correctly when
# the log was empty beforehand (filesize 0); a sed "1,0 d" range would still
# delete the first line in that case.
newconsolelines() {
   tail -n +$(( filesize + 1 )) "$consolelog"
}

echo "About to signal process $pid to create JVM thread/heap dumps." | tee -a "$logfn"
kill -s QUIT "$pid"  # Send the JVM a SIGQUIT signal
killrc=$?            # Save the status now; any later command overwrites $?
if [ "$killrc" -ne 0 ]; then
   echo "Attempt to signal PID $pid failed with exit status $killrc" | tee -a "$logfn"
   exit 4
fi

# Signal sent; wait for thread/heap dump creation
echo "Signal successfully sent; waiting $sleepamount seconds for dump creation..." | tee -a "$logfn"
sleep "$sleepamount"

# This code assumes that the JVM is an IBM JVM.  It looks for a particular sequence
# of JVM messages among the lines added to the linemode console log since the JVM
# was signalled.  JVMDG217 is issued when the JVM acknowledges the SIGQUIT; JVMDG318
# identifies the heapdump file; JVMDG304 identifies the thread dump file; JVMDG215
# indicates that the JVM has completed handling the SIGQUIT.  We loop with a delay
# until the JVM completes the dumps or runs out of time.
done=0
loopcount=0
while [[ $done -eq 0 && $loopcount -lt $sleepcount ]]; do
   if newconsolelines | grep -q 'JVMDG215'; then
      echo "JVM dumps are complete." | tee -a "$logfn"
      # Get the filenames of the heap dump and thread dump from the JVM
      # messages; the name is the last word of the message line.
      heapdump=$(newconsolelines | awk '/JVMDG318/ {print $NF}')
      javacore=$(newconsolelines | awk '/JVMDG304/ {print $NF}')
      done=1
   else
      if newconsolelines | grep -q 'JVMDG217'; then   # JVM got signal
         echo "Dumps are not yet complete; waiting another $sleepamount seconds..." | tee -a "$logfn"
      elif [ "$quick" == 1 ]; then   # User does not want to wait for unresponsive JVM
         echo "Not waiting on the JVM any longer; continuing to gather other files..." | tee -a "$logfn"
         done=1   # Suppresses the timeout message below
         break
      else   # JVM has not acknowledged signal yet, and user is willing to wait
         echo "JVM has not acknowledged signal; waiting another $sleepamount seconds..." | tee -a "$logfn"
      fi
      sleep "$sleepamount"
      loopcount=$(( loopcount + 1 ))
   fi
done
if [ "$done" -eq 0 ]; then
   echo "Timeout occurred waiting for JVM to create thread/heap dump" | tee -a "$logfn"
fi

# Build the list of files that go into the tar file.  The list is a plain
# space-separated string that is deliberately expanded unquoted below, so
# that it is split into separate names and its wildcards are expanded.
filestogather="$logfn $tracefile $logfile $compressedlogfiles $consolelog $javacore $heapdump $platformspecificfiles $tracebufsfn"
if [ "$alljvmdumps" == 1 ]; then # User wants all JVM thread/heap dumps, not just new ones
   filestogather="$filestogather $jvmthreaddumps $jvmheapdumps"
fi

echo "*** Listing of files to gather ***" >> "$logfn"
ls -l $filestogather >> "$logfn" 2>&1

# Files that are only listed in the log, not gathered.  When the user did not
# ask for all JVM dumps, the existing dumps are at least listed here.
echo "*** Listing of other interesting files ***" >> "$logfn"
otherfiles="$corefiles $hmctopdir/core*"
if [ "$alljvmdumps" == 0 ]; then
   otherfiles="$otherfiles $jvmthreaddumps $jvmheapdumps"
fi
ls -l $otherfiles >> "$logfn" 2>&1

echo "About to create tar file containing debugging information..." | tee -a "$logfn"
tar cvzf "$tarfn" $filestogather >> "$logfn" 2>&1
tarrc=$?   # Save tar's status now; the file test below overwrites $?
# Success is judged by the tar file existing rather than by tar's exit
# status, so an archive created despite some missing input files still counts.
if [ -f "$tarfn" ]; then
   echo "tar file $tarfn has been created."
else
   echo "Attempt to create tar file $tarfn failed with exit status $tarrc" | tee -a "$logfn"
   exit 5
fi

# The log is inside the tar file now; remove the loose copy
rm -- "$logfn"

exit 0
